Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd)

# Report dataset dimensions and the ASD / control split
glue('Number of genes: {nrow(datExpr)}\n',
     'Number of samples: {ncol(datExpr)} ',
     '({sum(datMeta$Diagnosis_ == "ASD")} ASD, ',
     '{sum(datMeta$Diagnosis_ != "ASD")} controls)')
## Number of genes: 30107
## Number of samples: 80 (45 ASD, 35 controls)

Filtering DE Genes

Significance criteria: adjusted p-value<0.05

# Keep only differentially expressed genes: rows of datExpr aligned with
# DE_info whose adjusted p-value is present and below 0.05
datExpr = datExpr %>% filter(!is.na(DE_info$padj) & DE_info$padj < 0.05)
print(sprintf('%s genes left.', nrow(datExpr)))
## [1] "3000 genes left."

Dimensionality reduction using PCA

Since there are more genes than samples, we can use PCA to reduce the dimension from ~30K genes to 80 components without losing any information, and use this reduced representation for methods that take too long on the whole dataset.

# Transpose so samples are rows, then project onto principal components;
# with 80 samples this keeps all the information in 80 dimensions
datExpr_t = t(datExpr)
pca = prcomp(datExpr_t)
datExpr_redDim = as.data.frame(pca$x)

Clustering Methods

clusterings = list()



K-means clustering

Chose k=3 as best number of clusters

# K-means on the samples: elbow method over k = 1..10, then fit the chosen k.
# set.seed makes kmeans' random initialisation reproducible.
set.seed(123)
# vapply instead of sapply: guarantees a numeric vector regardless of input
wss = vapply(1:10, function(k) kmeans(datExpr_t, k, nstart=25)$tot.withinss,
             numeric(1))
plot(wss, type='b', main='K-Means Clustering')
best_k = 3  # elbow of the within-cluster sum of squares curve
abline(v = best_k, col='blue')

datExpr_k_means = kmeans(datExpr_t, best_k, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster



Hierarchical Clustering

Chose k=7 as best number of clusters.

# Hierarchical clustering on euclidean distances between samples
h_clusts = as.dendrogram(hclust(dist(datExpr_t)))
plot(h_clusts)
abline(h=25, col='blue')  # cut height used to choose the number of clusters

best_k = 7

Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.

Colors:

  • Diagnosis: Blue=control, Green=ASD

  • Sex: Pink=Female, Blue=Male

  • Brain region: Pink=Frontal, Green=Temporal, Blue=Parietal, Purple=Occipital

  • Age: Purple=youngest, Yellow=oldest

clusterings[['hc']] = cutree(h_clusts, best_k)

# Build a named vector mapping each age value to a viridis colour, spanning
# the full age range observed in datMeta.
# NOTE(review): assumes ages are integers — seq(min, max) must line up
# one-to-one with viridis(max - min + 1); confirm against datMeta$Age.
create_viridis_dict = function(){
  age_range = range(datMeta$Age)
  palette = viridis(age_range[2] - age_range[1] + 1)
  names(palette) = seq(age_range[1], age_range[2])
  
  return(palette)
}
viridis_age_cols = create_viridis_dict()

# One colour column per metadata variable, with rows reordered to match the
# dendrogram's leaf order so the bars line up under the tree
dend_meta = datMeta[match(labels(h_clusts), rownames(datMeta)),] %>% 
            mutate('Diagnosis' = ifelse(Diagnosis_=='CTL','#008080','#86b300'), # Blue control, Green ASD
                   'Sex' = ifelse(Sex=='F','#ff6666','#008ae6'),                # Pink Female, Blue Male
                   'Region' = case_when(Brain_lobe=='Frontal'~'#F8766D',        # ggplot defaults for 4 colours
                                        Brain_lobe=='Temporal'~'#7CAE00',
                                        Brain_lobe=='Parietal'~'#00BFC4',
                                        Brain_lobe=='Occipital'~'#C77CFF'),
                   'Age' = viridis_age_cols[as.character(Age)]) %>%            # Purple: young, Yellow: old
            dplyr::select(Age, Region, Sex, Diagnosis)
# Hide leaf labels, colour branches by the best_k clusters (dendextend::set),
# then draw the metadata annotation bars under the dendrogram
h_clusts %>% set('labels', rep('', nrow(datMeta))) %>% set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)



Consensus Clustering

Chose the best clustering to be with k=8, which is basically two big clusters and some outliers

*The rest of the output plots can be found in the Data/Gandal/consensusClustering/samples_DE_genes folder



Independent Component Analysis

Following this paper’s guidelines:

  1. Run PCA and keep enough components to explain 60% of the variance (the first 2 components already explain over 60% of the variance, so we decided to keep the first 37, which accumulate 90% of the variance)

  2. Run ICA with that same number of nbComp as principal components kept to then filter them

  3. Select components with kurtosis > 3

  4. Assign obs to genes with FDR<0.01 using the fdrtool package

Note: Using the PCA reduced matrix because the algorithm didn’t converge with the original dataset


It’s not supposed to be a good method for clustering samples because ICA does not perform well with small samples (see Figure 4 of this paper)

Still, it leaves only 7 samples without a cluster

# For each sample, count how many ICA components it was assigned to
# (0 = sample left without any cluster)
ICA_clusters %>% rowSums %>% table
# ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) + 
#   geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') + 
#   theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()



WGCNA

This method doesn’t work well:

Using power=5, as it is the first power that exceeds the 0.85 threshold, and the reduced version of the dataset, because the original one never reaches the \(R^2\) threshold

best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = 1:30)
##    Power SFT.R.sq  slope truncated.R.sq  mean.k. median.k.   max.k.
## 1      1   0.0873 -0.323         0.0532 1.60e+01  1.56e+01 2.64e+01
## 2      2   0.5580 -0.912         0.7120 4.65e+00  4.03e+00 1.11e+01
## 3      3   0.6670 -1.060         0.6750 1.64e+00  1.18e+00 5.18e+00
## 4      4   0.8300 -1.090         0.8940 6.53e-01  3.97e-01 2.58e+00
## 5      5   0.8970 -1.070         0.9420 2.85e-01  1.48e-01 1.34e+00
## 6      6   0.8970 -1.050         0.9570 1.33e-01  6.02e-02 7.17e-01
## 7      7   0.9240 -1.060         0.9560 6.53e-02  2.62e-02 3.91e-01
## 8      8   0.9440 -1.040         0.9700 3.35e-02  1.06e-02 2.16e-01
## 9      9   0.1710 -2.070        -0.0377 1.78e-02  4.29e-03 1.21e-01
## 10    10   0.1670 -2.010        -0.0484 9.77e-03  1.87e-03 6.86e-02
## 11    11   0.1610 -1.900        -0.0620 5.50e-03  8.30e-04 3.92e-02
## 12    12   0.1770 -2.610        -0.0381 3.16e-03  3.73e-04 2.25e-02
## 13    13   0.1410 -2.230        -0.1030 1.85e-03  1.70e-04 1.32e-02
## 14    14   0.7960 -0.898         0.8180 1.11e-03  7.80e-05 8.34e-03
## 15    15   0.1060 -1.460        -0.1200 6.69e-04  3.61e-05 5.33e-03
## 16    16   0.7870 -0.865         0.8090 4.10e-04  1.69e-05 3.44e-03
## 17    17   0.8150 -0.870         0.8420 2.55e-04  7.74e-06 2.24e-03
## 18    18   0.0489 -0.916        -0.0662 1.60e-04  3.45e-06 1.48e-03
## 19    19   0.0823 -1.600        -0.0938 1.01e-04  1.54e-06 9.78e-04
## 20    20   0.1180 -1.840        -0.1320 6.43e-05  6.86e-07 6.52e-04
## 21    21   0.1890 -1.680         0.0398 4.13e-05  3.07e-07 4.37e-04
## 22    22   0.1900 -1.700         0.0396 2.67e-05  1.38e-07 2.94e-04
## 23    23   0.1960 -2.360         0.0109 1.73e-05  6.20e-08 1.98e-04
## 24    24   0.1940 -2.300         0.0247 1.13e-05  2.79e-08 1.34e-04
## 25    25   0.2140 -2.680         0.0094 7.41e-06  1.26e-08 9.13e-05
## 26    26   0.2210 -2.640         0.0197 4.87e-06  5.70e-09 6.21e-05
## 27    27   0.1920 -2.590        -0.0360 3.22e-06  2.58e-09 4.23e-05
## 28    28   0.2100 -2.430         0.0107 2.13e-06  1.17e-09 2.89e-05
## 29    29   0.2230 -2.780         0.0014 1.42e-06  5.31e-10 1.98e-05
## 30    30   0.2300 -2.740         0.0104 9.42e-07  2.40e-10 1.35e-05

Running WGCNA with power=5

# Build the WGCNA network with the estimated best power
# (first power whose scale-free fit crosses the R^2 threshold)
network = blockwiseModules(t(datExpr_redDim), power = best_power$powerEstimate,
                           numericLabels = TRUE)
##      mergeCloseModules: less than two proper modules.
##       ..color levels are 0, 1
##       ..there is nothing to merge.

Cluster distribution

# Module sizes found by WGCNA (label 0 = samples assigned to no module)
table(network$colors)
# Store the assignments named by sample, like the other methods
clusterings[['WGCNA']] = setNames(network$colors, rownames(datExpr_redDim))

Using the original expression dataset, the \(R^2\) threshold is never achieved, getting closest at power 1, but still doesn’t manage to find any clusters within the data

best_power = datExpr %>% pickSoftThreshold(powerVector = 1:30)
##    Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1      1    0.298 666.0         0.1110    77.9      77.9   78.2
## 2      2    0.297 333.0         0.1090    76.7      76.9   77.3
## 3      3    0.297 222.0         0.1090    75.6      75.8   76.5
## 4      4    0.296 167.0         0.1070    74.6      74.8   75.7
## 5      5    0.298 134.0         0.1080    73.5      73.8   74.9
## 6      6    0.297 112.0         0.1070    72.4      72.8   74.2
## 7      7    0.297  95.5         0.1060    71.4      71.9   73.4
## 8      8    0.296  83.5         0.1050    70.4      70.9   72.6
## 9      9    0.295  74.2         0.1030    69.4      70.0   71.9
## 10    10    0.298  67.3         0.1050    68.4      69.0   71.1
## 11    11    0.298  61.2         0.1040    67.4      68.1   70.4
## 12    12    0.297  56.1         0.1020    66.5      67.2   69.6
## 13    13    0.292  51.2         0.0976    65.5      66.3   68.9
## 14    14    0.292  47.5         0.0967    64.6      65.4   68.2
## 15    15    0.291  44.3         0.0955    63.7      64.6   67.5
## 16    16    0.290  41.5         0.0941    62.8      63.7   66.8
## 17    17    0.289  39.1         0.0927    61.9      62.9   66.1
## 18    18    0.288  36.9         0.0914    61.1      62.1   65.4
## 19    19    0.288  34.9         0.0900    60.2      61.2   64.7
## 20    20    0.287  33.2         0.0886    59.4      60.4   64.0
## 21    21    0.286  31.7         0.0883    58.6      59.6   63.4
## 22    22    0.285  30.2         0.0870    57.7      58.9   62.7
## 23    23    0.285  28.9         0.0855    56.9      58.1   62.1
## 24    24    0.284  27.7         0.0849    56.1      57.3   61.4
## 25    25    0.282  26.8         0.0835    55.4      56.6   60.8
## 26    26    0.281  25.8         0.0820    54.6      55.9   60.2
## 27    27    0.280  24.7         0.0802    53.9      55.1   59.6
## 28    28    0.279  23.9         0.0787    53.1      54.4   58.9
## 29    29    0.279  23.0         0.0776    52.4      53.7   58.3
## 30    30    0.278  22.3         0.0762    51.7      53.0   57.7



Gaussian Mixture Models with hard thresholding

Points don’t seem to follow a Gaussian distribution no matter the number of clusters; chose 3 groups following the best k from K-means because the methods are similar

# Model selection: BIC for 1..50 mixture components
n_clust = Optimal_Clusters_GMM(datExpr_redDim, max_clusters = 50,
                               criterion = 'BIC', plot_data = FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')

best_k = 3 # copying k-means best_k
gmm = GMM(datExpr_redDim, best_k)
# Hard assignment: each sample goes to its highest log-likelihood component
clusterings[['GMM']] = apply(gmm$Log_likelihood, 1, which.max)

Plot of clusters with their centroids in gray

# Append the centroids (with matching column names) to the sample points;
# centroids get NA labels so they render in grey on the plot
gmm_points = rbind(datExpr_redDim,
                   setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = as.factor(c(clusterings[['GMM']], rep(NA, best_k)))
ggplotly(ggplot(gmm_points, aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
# Drop the per-method intermediates, keeping datExpr, datExpr_redDim, datMeta
# and the clusterings list for the comparison section below.
# NOTE(review): 'c' removes an object literally named c, presumably created in
# a hidden chunk — confirm it exists, otherwise rm() will warn.
rm(wss, datExpr_k_means, h_clusts, cc_output, best_k, ICA_output, ICA_clusters_names, 
   signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, dend_meta, 
   best_power, c, viridis_age_cols, create_viridis_dict, pca)



Compare clusterings

Using Adjusted Rand Index:

# Put the method clusterings and the phenotype variables in one list so the
# ARI can be computed between every pair of them
clusters_plus_phenotype = c(clusterings,
                            list('Region'  = datMeta$Brain_lobe,
                                 'Sex'     = datMeta$Sex,
                                 'Age'     = datMeta$Age,
                                 'Subject' = datMeta$Subject_ID,
                                 'ASD'     = datMeta$Diagnosis_))

# Pairwise Adjusted Rand Index between all clusterings and phenotypes.
# Only the upper triangle (diagonal included) is filled; the lower triangle
# stays NA, which heatmap.2 renders as empty cells.
cluster_sim = data.frame(matrix(nrow = length(clusters_plus_phenotype), ncol = length(clusters_plus_phenotype)))
# seq_along instead of 1:length(): safe if the list were ever empty
for(i in seq_along(clusters_plus_phenotype)){
  cluster1 = as.factor(clusters_plus_phenotype[[i]])
  for(j in i:length(clusters_plus_phenotype)){
    cluster2 = as.factor(clusters_plus_phenotype[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusters_plus_phenotype)
rownames(cluster_sim) = colnames(cluster_sim)

# Round for display and plot as an annotated heatmap: no row/column
# reordering, no dendrograms, ARI values printed inside the cells
cluster_sim = round(as.matrix(cluster_sim), 2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none', 
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE, 
          cexRow = 1, cexCol = 1, margins = c(7,7))

# Clean up the comparison intermediates
rm(i, j, cluster1, cluster2, clusters_plus_phenotype, cluster_sim)



Scatter plots

# Gather the first 3 PCs, every clustering and the phenotype variables into a
# single data frame for the interactive scatter plots
plot_points = datExpr_redDim %>%
              data.frame() %>%
              dplyr::select(PC1:PC3) %>%
              mutate(ID = rownames(.),
                     Subject_ID = datMeta$Subject_ID,
                     KMeans = as.factor(clusterings[['km']]),
                     Hierarchical = as.factor(clusterings[['hc']]),
                     Consensus = as.factor(clusterings[['cc']]),
                     ICA = as.factor(clusterings[['ICA_min']]),
                     GMM = as.factor(clusterings[['GMM']]),
                     WGCNA = as.factor(clusterings[['WGCNA']]),
                     Sex = as.factor(datMeta$Sex),
                     Region = as.factor(datMeta$Brain_lobe),
                     Diagnosis = as.factor(datMeta$Diagnosis_),
                     Age = datMeta$Age)

Now, PC1 seems to separate samples by Diagnosis pretty well

selectable_scatter_plot(plot_points, plot_points)